package com.feiyuan.book.webmagic.bqg.processor;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;

/**
 * WebMagic page processor that extracts the chapter body from a book chapter page
 * and stores it, re-wrapped as a sequence of {@code <p>} paragraphs, in the page's
 * result field {@code "content"}.
 */
public class BookContentProcessor implements PageProcessor {

    /** XPath selecting the element that holds the chapter text. */
    private static final String CONTENT_XPATH = "//div[@class='content']//div[@id='chaptercontent']";

    /** Opening tag of the chapter container as it appears in the serialized HTML. */
    private static final String CONTAINER_OPEN_TAG = "<div id=\"chaptercontent\" class=\"Readarea ReadAjax_content\">";

    /** Closing tag removed from the serialized HTML (every occurrence is removed). */
    private static final String DIV_CLOSE_TAG = "</div>";

    /** Separator between text fragments in the source markup. */
    private static final String LINE_BREAK_TAG = "<br>";

    /** Fragments containing this site-promotion text are dropped from the output. */
    private static final String PROMOTION_MARKER = "请收藏本站";

    /** Fragments containing this marker (nested paragraph markup) are dropped from the output. */
    private static final String PARAGRAPH_MARKER = "<p";

    // Part one: crawl configuration for the site, including charset, crawl interval, retry count, etc.
    private final Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setCharset("UTF-8");

    /**
     * Extracts the chapter text from {@code page} and stores it under the result
     * field {@code "content"}.
     *
     * <p>If the chapter container is not found, the page is marked as skipped
     * instead of failing with a {@link NullPointerException}.
     *
     * @param page the downloaded page to process
     */
    @Override
    public void process(Page page) {
        String content = page.getHtml().xpath(CONTENT_XPATH).get();
        if (content == null) {
            // No chapter container on this page (error page, layout change, ...):
            // there is nothing meaningful to emit, so skip the result.
            page.setSkip(true);
            return;
        }
        page.putField("content", toParagraphs(content));
    }

    /**
     * Returns the crawl configuration used for this processor.
     *
     * @return the site configuration
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Converts the serialized chapter container into {@code <p>}-wrapped paragraphs.
     *
     * <p>The container's own tags are stripped, the remainder is split on
     * {@code <br>}, and each fragment is wrapped in a paragraph unless it is blank,
     * contains the promotion text, or contains paragraph markup.
     *
     * @param containerHtml serialized HTML of the chapter container; must not be null
     * @return the concatenated paragraphs, or an empty string if no fragment qualifies
     */
    private static String toParagraphs(String containerHtml) {
        String inner = containerHtml.replace(CONTAINER_OPEN_TAG, "").replace(DIV_CLOSE_TAG, "");
        StringBuilder paragraphs = new StringBuilder(inner.length());
        for (String fragment : inner.split(LINE_BREAK_TAG)) {
            // Trim serializer newlines/indentation; consecutive <br> tags otherwise
            // produce whitespace-only fragments and therefore empty paragraphs.
            String text = fragment.trim();
            if (text.isEmpty() || text.contains(PROMOTION_MARKER) || text.contains(PARAGRAPH_MARKER)) {
                continue;
            }
            paragraphs.append("<p>").append(text).append("</p>");
        }
        return paragraphs.toString();
    }
}
